# Chunk defaults for the whole report: echo the code, hide warnings and messages.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library (readr)
library(stringr)
library(ggplot2)
library(plotly)
library(dplyr)
library(magrittr)
library(scales)
library(shiny)
library(kableExtra)
library(Hmisc)
library(glue)

# Location of the cleaned gapminder CSV used throughout the report.
urlfile <- "https://raw.githubusercontent.com/Bioinformatics-Research-Network/skill-assessments/main/R%20for%20Data%20Science/gapminder_clean.csv"

# Read the CSV through a URL connection.
mydata <- urlfile %>%
  url() %>%
  read_csv()

# Replace every space in the column names with a dot.
names(mydata) <- names(mydata) %>%
  str_replace_all(c(" " = "."))

 

This is a report of the analysis of gapminder_clean.csv data.

1 Analysis of data from year 1962

 

# Rows for 1962 only; later sections (correlation test, life-expectancy
# comparison) reuse this subset.
mydata_1962 <- mydata %>%
  filter(Year == 1962)

# Scatter plot of GDP per capita against CO2 emissions per capita for 1962.
plot_1962 <- ggplot(
  mydata_1962,
  aes(x = `CO2.emissions.(metric.tons.per.capita)`, y = gdpPercap)
) +
  geom_point(color = "firebrick") +
  ggtitle("The correlation between CO2 emissions and GDP per capita in year 1962") +
  # Plain string label, consistent with the 1967 plot below.
  labs(y = "GDP per capita", x = "CO2 emissions (metric tons per capita)") +
  theme(
    axis.title.x = element_text(vjust = 0, size = 15),
    axis.title.y = element_text(vjust = 2, size = 15),
    axis.text = element_text(size = 10),
    plot.title = element_text(hjust = 0.5)
  )

print(plot_1962)

 

Pearson’s correlation of CO2 emissions and GDP per capita in 1962 was calculated and the results are presented below.

# Pearson correlation test between CO2 emissions per capita and GDP per capita
# in the 1962 subset (method stated explicitly; no stray trailing argument).
cor.test(mydata_1962$`CO2.emissions.(metric.tons.per.capita)`, mydata_1962$gdpPercap,
         method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  mydata_1962$`CO2.emissions.(metric.tons.per.capita)` and mydata_1962$gdpPercap
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8934697 0.9489792
## sample estimates:
##       cor 
## 0.9260817

 

The correlation of ‘CO2 emissions (metric tons per capita)’ and gdpPercap equals 0.9260817.

The associated p-value is below 2.2e-16.

2 In what year is the correlation between CO2 emissions and GDP per capita the strongest?

 

# Distinct years present in the data, in order of first appearance.
all_years <- unique(mydata$Year)

# One row per year: the correlation between CO2 emissions per capita and GDP
# per capita, computed on complete pairs only. The table is built in a single
# step instead of being grown row by row, and no local variable shadows
# base::subset().
year_cor_co2_gdp <- data.frame(
  Year = all_years,
  Correlation = vapply(
    all_years,
    function(year) {
      year_rows <- mydata %>%
        filter(Year == year)
      cor(year_rows$`CO2.emissions.(metric.tons.per.capita)`,
          year_rows$gdpPercap, use = "complete.obs")
    },
    numeric(1),
    USE.NAMES = FALSE
  )
)

# Show the years from strongest to weakest correlation; row names keep the
# original position of each year in all_years.
year_cor_co2_gdp[order(year_cor_co2_gdp$Correlation, decreasing = TRUE), ] %>%
  kbl() %>%
  kable_material(c("striped", "hover"))
Year Correlation
2 1967 0.9387918
1 1962 0.9260817
3 1972 0.8428986
5 1982 0.8166384
6 1987 0.8095531
7 1992 0.8094316
8 1997 0.8081396
9 2002 0.8006421
4 1977 0.7928336
10 2007 0.7204169

 

The correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap is the strongest in the year 1967.  

# Rows for 1967 only.
mydata_1967 <- filter(mydata, Year == 1967)

# Axis text sizes and title offsets for the 1967 scatter plot.
axis_styling_1967 <- theme(
  axis.title.x = element_text(vjust = 0, size = 15),
  axis.title.y = element_text(vjust = 2, size = 15),
  axis.text = element_text(size = 10)
)

# GDP per capita against CO2 emissions per capita; point size encodes
# population and colour encodes continent.
plot_1967 <- ggplot(mydata_1967) +
  aes(x = `CO2.emissions.(metric.tons.per.capita)`, y = gdpPercap) +
  geom_point(aes(size = pop, colour = continent)) +
  labs(
    title = "The correlation between CO2 emissions and GDP per capita in year 1967",
    x = "CO2 emissions (metric tons per capita)",
    y = "GDP per capita"
  ) +
  axis_styling_1967

# Interactive version of the plot, centred in the page.
div(ggplotly(plot_1967), align = "center")

3 What is the relationship between continent and energy use?

 

One-way ANOVA was chosen to examine the relationship between continent and energy use, because the data have one categorical independent variable (continent) with multiple levels (Asia, Europe, Africa, Americas, Oceania) and one quantitative dependent variable (energy use).

Results are presented below.

# One-way ANOVA of per-capita energy use by continent.
# NOTE(review): the model is fitted on mydata_1967 only, so the result describes
# 1967 rather than all years — confirm this restriction is intended.
energy_by_continent <- `Energy.use.(kg.of.oil.equivalent.per.capita)` ~ continent
one_way_anova_1967 <- aov(formula = energy_by_continent, data = mydata_1967)

summary(one_way_anova_1967)
##             Df   Sum Sq  Mean Sq F value   Pr(>F)    
## continent    3 30161255 10053752   9.642 0.000334 ***
## Residuals   21 21895723  1042653                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 234 observations deleted due to missingness

 

The p-value is low (p < 0.001), so energy use appears to differ between continents in the 1967 data used for this test.

4 Is there a significant difference between Europe and Asia with respect to imports of goods and services in the years after 1990?

 

Because two groups (Europe and Asia) were being compared, a t-test was chosen to assess whether there is a significant difference between Europe and Asia with respect to imports of goods and services in the years after 1990.

Results are presented below.

# European and Asian rows from years strictly after 1990.
mydata_1990 <- filter(
  mydata,
  Year > 1990,
  continent %in% c("Europe", "Asia")
)

# Two-sample t-test of imports (% of GDP) between the two continents.
ttest_1990 <- t.test(
  `Imports.of.goods.and.services.(%.of.GDP)` ~ continent,
  data = mydata_1990
)

print(ttest_1990)
## 
##  Welch Two Sample t-test
## 
## data:  Imports.of.goods.and.services.(%.of.GDP) by continent
## t = 1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
##  -2.321099 12.433240
## sample estimates:
##   mean in group Asia mean in group Europe 
##             46.84531             41.78924

 

Because the p-value is high (> 0.05), it is concluded that there is no significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990.

5 Which country (or countries) has the highest population density across all years?

 

# Distinct country names, in order of first appearance in the data.
all_countries <- unique(mydata$Country.Name)

# Mean population density per country over all available years (missing values
# ignored). The table is built in one step, so the averages stay numeric
# instead of being coerced to character by c(country, average) and parsed back.
pop_dens_avg <- data.frame(
  Country = all_countries,
  Average.population.density = vapply(
    all_countries,
    function(country) {
      country_rows <- mydata %>%
        filter(Country.Name == country)
      mean(
        as.numeric(country_rows$`Population.density.(people.per.sq..km.of.land.area)`),
        na.rm = TRUE
      )
    },
    numeric(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)

# Densest countries first; row names keep each country's position in
# all_countries.
pop_dens_avg <- pop_dens_avg[order(pop_dens_avg$Average.population.density, decreasing = TRUE), ]

head(pop_dens_avg) %>%
  kbl() %>%
  kable_material(c("striped", "hover"))
Country Average.population.density
145 Macao SAR, China 14732.035
163 Monaco 14089.900
101 Hong Kong SAR, China 5153.057
209 Singapore 4361.500
88 Gibraltar 2622.250
23 Bermuda 1132.780

 

Macao SAR, China has the highest average ‘Population density (people per sq. km of land area)’ across all years: 14732.035 people per sq. km.

6 Which country (or countries) has shown the greatest increase in life expectancy at birth since 1962?

 

# Earliest and latest year in the data. min()/max() are used so the result
# does not depend on the row order of the CSV.
first_year <- min(all_years, na.rm = TRUE)

last_year <- max(all_years, na.rm = TRUE)

print(glue("First measurment was taken in {first_year} and last one in {last_year}."))
## First measurment was taken in 1962 and last one in 2007.
# Rows for 2007 only.
mydata_2007 <- mydata %>%
  filter(Year == 2007)

life_exp_col <- "Life.expectancy.at.birth,.total.(years)"

# Life expectancy per country in each year, aligned to all_countries.
# Countries absent from a year (or with a missing name) get NA.
life_exp_1962 <- mydata_1962[[life_exp_col]][
  match(all_countries, mydata_1962$Country.Name, incomparables = NA)
]
life_exp_2007 <- mydata_2007[[life_exp_col]][
  match(all_countries, mydata_2007$Country.Name, incomparables = NA)
]

# One row per country, built in a single vectorised step so the numeric columns
# are never coerced to character and parsed back.
# NOTE(review): Life.exp.increase.percentage is the 2007 value expressed as a
# percentage of the 1962 value (ratio * 100), not the percent change — confirm
# which of the two the report intends.
exp_increase <- data.frame(
  Country = all_countries,
  Life.exp.increase.numerical = life_exp_2007 - life_exp_1962,
  Life.exp.increase.percentage = round(life_exp_2007 / life_exp_1962 * 100, digits = 1),
  stringsAsFactors = FALSE
)

 

The table below is sorted in descending order of the numerical life expectancy increase.

# Six countries with the largest gain in years of life expectancy.
by_years_gained <- order(exp_increase$Life.exp.increase.numerical, decreasing = TRUE)

exp_increase[by_years_gained, ] %>%
  head() %>%
  kbl() %>%
  kable_material(c("striped", "hover"))
Country Life.exp.increase.numerical Life.exp.increase.percentage
150 Maldives 36.91615 195.9
24 Bhutan 33.19895 200.3
238 Timor-Leste 31.08515 189.5
242 Tunisia 30.86076 171.2
182 Oman 30.82310 169.6
171 Nepal 30.59963 185.1

 

The table below is sorted in descending order of the percentage life expectancy increase.

# Six countries with the largest value in the percentage column.
by_percentage <- order(exp_increase$Life.exp.increase.percentage, decreasing = TRUE)

exp_increase[by_percentage, ] %>%
  head() %>%
  kbl() %>%
  kable_material(c("striped", "hover"))
Country Life.exp.increase.numerical Life.exp.increase.percentage
24 Bhutan 33.19895 200.3
150 Maldives 36.91615 195.9
151 Mali 25.71346 190.1
238 Timor-Leste 31.08515 189.5
171 Nepal 30.59963 185.1
84 Gambia, The 25.90834 179.3

 

In the Maldives life expectancy has grown by 37 years, reaching 196% of its 1962 value (an increase of about 96%). In Bhutan life expectancy has grown by 33 years, reaching just over 200% of its 1962 value (an increase of about 100%).